this Rmarkdown is the final version for RAISIN Smart-Seq2 data analysis
first load all needed packages/functions which are integrated in script 'analysis.r'
most of the script are from example code https://github.com/cssmillie/ulcerative_colitis, and modified to fit local environment and current datasets
source("analysis.r")
allmeta <- read.table(paste0(raw_dir,"all.meta.txt"), header = T, sep = "\t")
allmeta <- allmeta[2:nrow(allmeta),]
dim(allmeta)
## [1] 586460 16
cat(paste0("cells with nGene > 1000: \n",sum(allmeta$nGene > 1000),
"\n\ncells with nGene >= 1000: \n",sum(allmeta$nGene >= 1000)))
## cells with nGene > 1000:
## 585917
##
## cells with nGene >= 1000:
## 586460
this allmeta has information for all filtered nuclei, total 586,460 cells (nGene >= 1000)
sub-slusters in paper are marked under column 'Annotation'
head(allmeta)
## NAME Age Annotation Dataset
## 2 H10_S46.GGAACTTCAAGAAGAG 35 Fibroblast_2 Human colon all cells (10X)
## 3 H10_S46.CCGTACTCACCTCGGA 35 Epithelial Human colon all cells (10X)
## 4 H10_S46.GATGCTAAGAGCAATT 35 Fibroblast_1 Human colon all cells (10X)
## 5 H10_S46.CCCAGTTAGGGATCTG 35 Myocyte_3 Human colon all cells (10X)
## 6 H10_S46.GGATTACCAGGATTGG 35 Myocyte_2 Human colon all cells (10X)
## 7 H10_S46.CAGAATCTCGCGGATC 35 Fibroblast_2 Human colon all cells (10X)
## Location_ID Mouse_ID Overload Patient_ID Region Segment Sex Time Type
## 2 Sigmoid <NA> 1X H10 Myenteric <NA> M <NA> <NA>
## 3 Sigmoid <NA> 1X H10 Myenteric <NA> M <NA> <NA>
## 4 Sigmoid <NA> 1X H10 Myenteric <NA> M <NA> <NA>
## 5 Sigmoid <NA> 1X H10 Myenteric <NA> M <NA> <NA>
## 6 Sigmoid <NA> 1X H10 Myenteric <NA> M <NA> <NA>
## 7 Sigmoid <NA> 1X H10 Myenteric <NA> M <NA> <NA>
## Unique_ID nGene nUMI
## 2 S46 1166 1563
## 3 S46 1554 2061
## 4 S46 1335 1760
## 5 S46 1008 1367
## 6 S46 1066 1398
## 7 S46 1761 2437
column names of allmeta
colnames(allmeta)
## [1] "NAME" "Age" "Annotation" "Dataset" "Location_ID"
## [6] "Mouse_ID" "Overload" "Patient_ID" "Region" "Segment"
## [11] "Sex" "Time" "Type" "Unique_ID" "nGene"
## [16] "nUMI"
using 'Dataset' could get a stat for each dataset
data.frame(table(allmeta$Dataset))
## Var1 Freq
## 1 Human colon all cells (10X) 146442
## 2 Human colon enteric glia (10X) 6054
## 3 Human colon enteric neurons (10X) 1445
## 4 Mouse colon all cells (10X) 343000
## 5 Mouse colon enteric glia (10X) 1690
## 6 Mouse colon enteric glia (Smart-Seq2) 3039
## 7 Mouse colon enteric neurons (10X) 1938
## 8 Mouse colon enteric neurons (Smart-Seq2) 2657
## 9 Mouse ileum all cells (10X) 79293
## 10 Mouse ileum enteric glia (10X) 429
## 11 Mouse ileum enteric neurons (10X) 473
how about Smart-Seq2 datasets
# for this small amount of lines, could directly use
# data.frame(table(allmeta$Dataset))[c(6,8),]
data.frame(table(allmeta$Dataset))[grep("Smart-Seq2",as.character(data.frame(table(allmeta$Dataset))$Var1)), ]
## Var1 Freq
## 6 Mouse colon enteric glia (Smart-Seq2) 3039
## 8 Mouse colon enteric neurons (Smart-Seq2) 2657
these '3039' glia and '2657' neurons of Mouse colon were first labeled in mice, then FACS sorted into plates,
so no 'all cells' as other droplet-datasets
subset allmeta to _ss2
allmeta_ss2 <- allmeta[allmeta$Dataset %in% c("Mouse colon enteric glia (Smart-Seq2)", "Mouse colon enteric neurons (Smart-Seq2)"),]
# continue to partition
allmeta_ss2_neur <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric neurons (Smart-Seq2)"),]
allmeta_ss2_glia <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric glia (Smart-Seq2)"),]
# check
cat("dimention of allmeta_ss2: \n",dim(allmeta_ss2),
"\n\ndimention of allmeta_ss2_neur: \n",dim(allmeta_ss2_neur),
"\n\ndimention of allmeta_ss2_glia: \n",dim(allmeta_ss2_glia))
## dimention of allmeta_ss2:
## 5696 16
##
## dimention of allmeta_ss2_neur:
## 2657 16
##
## dimention of allmeta_ss2_glia:
## 3039 16
head(allmeta_ss2)
## NAME Age Annotation Dataset
## 572593 M1_S1.S339 Young PIMN_4 Mouse colon enteric neurons (Smart-Seq2)
## 572594 M1_S1.S351 Young PIMN_4 Mouse colon enteric neurons (Smart-Seq2)
## 572595 M1_S1.S363 Young PIMN_4 Mouse colon enteric neurons (Smart-Seq2)
## 572596 M1_S1.S349 Young PIMN_5 Mouse colon enteric neurons (Smart-Seq2)
## 572597 M1_S1.S313 Young PIMN_4 Mouse colon enteric neurons (Smart-Seq2)
## 572598 M1_S1.S289 Young PIN_1 Mouse colon enteric neurons (Smart-Seq2)
## Location_ID Mouse_ID Overload Patient_ID Region Segment Sex Time Type
## 572593 <NA> M1 <NA> <NA> <NA> <NA> F <NA> Sox10
## 572594 <NA> M1 <NA> <NA> <NA> <NA> F <NA> Sox10
## 572595 <NA> M1 <NA> <NA> <NA> <NA> F <NA> Sox10
## 572596 <NA> M1 <NA> <NA> <NA> <NA> F <NA> Sox10
## 572597 <NA> M1 <NA> <NA> <NA> <NA> F <NA> Sox10
## 572598 <NA> M1 <NA> <NA> <NA> <NA> F <NA> Sox10
## Unique_ID nGene nUMI
## 572593 S1 7060 1777551.53
## 572594 S1 7419 1435340.23
## 572595 S1 7674 1483155.55
## 572596 S1 8018 1505426.6
## 572597 S1 6985 1412616.17
## 572598 S1 7945 1210014.21
#tail(allmeta_ss2)
#head(allmeta_ss2_neur)
#head(allmeta_ss2_glia)
allmeta_ss2 is incomplete for all conditions, need to recover them from table mmc2.xlsx 'Mous Colon (plate)'
mmc2_ss2 <- read.table("../test_4_for_RAISIN/mmc2_1.csv",header=TRUE,sep = ",")
mmc2_ss2[1:10,]
## Mouse_ID Sample_ID Age Age_Processed Model Model_Processed Segment
## 1 M1 S1 12 Young Sox10 Sox10 All
## 2 M2 S2 12 Young Sox10 Sox10 All
## 3 M3 S3 12 Young Sox10 Sox10 All
## 4 M4 S4 12 Young Sox10 Sox10 All
## 5 M5 S5 12 Young Sox10 Sox10 All
## 6 M6 S6 12 Young Sox10 Sox10 All
## 7 M7 S7 12 Young Sox10 Sox10 1
## 8 M7 S8 12 Young Sox10 Sox10 2
## 9 M7 S9 12 Young Sox10 Sox10 3
## 10 M7 S10 12 Young Sox10 Sox10 4
## Segment_Processed Sex Sex_Processed Time Time_Processed Count_Neuron
## 1 <NA> F F <NA> <NA> 10
## 2 <NA> M M <NA> <NA> 4
## 3 <NA> M M <NA> <NA> 4
## 4 <NA> M M <NA> <NA> 23
## 5 <NA> F F 2PM <NA> 14
## 6 <NA> M M 7AM 7AM 15
## 7 Proximal F F 7AM 7AM 17
## 8 Proximal F F 7AM 7AM 10
## 9 Distal F F 7AM 7AM 9
## 10 Distal F F 7AM 7AM 17
## nGene_Neuron nUMI_Neuron Count_Glia nGene_Glia nUMI_Glia
## 1 7258.900 1365085.3 14 4827.071 1089717.3
## 2 7641.750 1177631.3 21 5194.619 952311.3
## 3 6442.750 1078223.6 16 5170.312 1067908.4
## 4 6966.565 905060.1 51 4833.804 709912.2
## 5 7773.357 1447690.7 54 4910.370 756925.7
## 6 7475.600 1111227.2 31 5111.355 943599.6
## 7 7585.529 1001655.7 50 5077.520 679572.9
## 8 7349.900 915390.5 32 5017.406 843839.6
## 9 7807.222 1201613.6 30 5156.567 842031.2
## 10 7602.647 1183935.2 35 5294.057 906308.8
depending on allmeta_ss2:Unique_ID = mmc_ss2:Sample_ID
# allmeta_ss2 just keep these columns, conditions all use mmc2's
allmeta_ss2 <- allmeta_ss2[,c("NAME","Annotation","Dataset","Mouse_ID","Unique_ID","nGene","nUMI")]
# for same Sample_ID, copy mmc2 columns 3-12, i.e. Age-Time_Processed
for(i in 1:nrow(mmc2_ss2)){
allmeta_ss2[allmeta_ss2$Unique_ID == mmc2_ss2$Sample_ID[i], colnames(mmc2_ss2)[3:12]] <- mmc2_ss2[i,3:12]
}
rownames(allmeta_ss2) <- allmeta_ss2$NAME
# repartition
allmeta_ss2_neur <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric neurons (Smart-Seq2)"),]
allmeta_ss2_glia <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric glia (Smart-Seq2)"),]
allmeta_ss2[1001:1005,]
## NAME Annotation Dataset
## M19_S60.S304 M19_S60.S304 PIMN_1 Mouse colon enteric neurons (Smart-Seq2)
## M19_S60.S296 M19_S60.S296 PSVN_1 Mouse colon enteric neurons (Smart-Seq2)
## M19_S60.S313 M19_S60.S313 PIMN_1 Mouse colon enteric neurons (Smart-Seq2)
## M19_S60.S321 M19_S60.S321 PEMN_2 Mouse colon enteric neurons (Smart-Seq2)
## M19_S60.S318 M19_S60.S318 PIN_1 Mouse colon enteric neurons (Smart-Seq2)
## Mouse_ID Unique_ID nGene nUMI Age Age_Processed Model
## M19_S60.S304 M19 S60 11385 2034421.15 12 Young Uchl1
## M19_S60.S296 M19 S60 9780 2017089.15 12 Young Uchl1
## M19_S60.S313 M19 S60 10202 2249290.68 12 Young Uchl1
## M19_S60.S321 M19 S60 10170 2388613.29 12 Young Uchl1
## M19_S60.S318 M19 S60 9729 2264351.92 12 Young Uchl1
## Model_Processed Segment Segment_Processed Sex Sex_Processed Time
## M19_S60.S304 Uchl1 4 Distal M M 7AM
## M19_S60.S296 Uchl1 4 Distal M M 7AM
## M19_S60.S313 Uchl1 4 Distal M M 7AM
## M19_S60.S321 Uchl1 4 Distal M M 7AM
## M19_S60.S318 Uchl1 4 Distal M M 7AM
## Time_Processed
## M19_S60.S304 7AM
## M19_S60.S296 7AM
## M19_S60.S313 7AM
## M19_S60.S321 7AM
## M19_S60.S318 7AM
allmeta_ss2_neur[501:505,]
## NAME Annotation Dataset
## M13_S36.S171 M13_S36.S171 PIMN_6 Mouse colon enteric neurons (Smart-Seq2)
## M13_S36.S124 M13_S36.S124 PIMN_6 Mouse colon enteric neurons (Smart-Seq2)
## M13_S36.S97 M13_S36.S97 PIMN_5 Mouse colon enteric neurons (Smart-Seq2)
## M13_S36.S164 M13_S36.S164 PSN_3 Mouse colon enteric neurons (Smart-Seq2)
## M13_S36.S142 M13_S36.S142 PEMN_3 Mouse colon enteric neurons (Smart-Seq2)
## Mouse_ID Unique_ID nGene nUMI Age Age_Processed Model
## M13_S36.S171 M13 S36 7868 1176891.69 12 Young Sox10
## M13_S36.S124 M13 S36 7279 658859.28 12 Young Sox10
## M13_S36.S97 M13 S36 8132 1621489.91 12 Young Sox10
## M13_S36.S164 M13 S36 6134 637731.59 12 Young Sox10
## M13_S36.S142 M13 S36 6827 999976.02 12 Young Sox10
## Model_Processed Segment Segment_Processed Sex Sex_Processed Time
## M13_S36.S171 Sox10 2 Proximal M M 7PM
## M13_S36.S124 Sox10 2 Proximal M M 7PM
## M13_S36.S97 Sox10 2 Proximal M M 7PM
## M13_S36.S164 Sox10 2 Proximal M M 7PM
## M13_S36.S142 Sox10 2 Proximal M M 7PM
## Time_Processed
## M13_S36.S171 7PM
## M13_S36.S124 7PM
## M13_S36.S97 7PM
## M13_S36.S164 7PM
## M13_S36.S142 7PM
allmeta_ss2_glia[1301:1305,]
## NAME Annotation Dataset
## M12_S32.S120 M12_S32.S120 Glia_1 Mouse colon enteric glia (Smart-Seq2)
## M12_S32.S109 M12_S32.S109 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M12_S32.S126 M12_S32.S126 Glia_1 Mouse colon enteric glia (Smart-Seq2)
## M12_S32.S138 M12_S32.S138 Glia_3 Mouse colon enteric glia (Smart-Seq2)
## M12_S32.S167 M12_S32.S167 Glia_1 Mouse colon enteric glia (Smart-Seq2)
## Mouse_ID Unique_ID nGene nUMI Age Age_Processed Model
## M12_S32.S120 M12 S32 3415 746971.26 12 Young Sox10
## M12_S32.S109 M12 S32 4856 951135.46 12 Young Sox10
## M12_S32.S126 M12 S32 4330 598552.18 12 Young Sox10
## M12_S32.S138 M12 S32 5014 637799.64 12 Young Sox10
## M12_S32.S167 M12 S32 4782 870248.93 12 Young Sox10
## Model_Processed Segment Segment_Processed Sex Sex_Processed Time
## M12_S32.S120 Sox10 2 Proximal M M 7PM
## M12_S32.S109 Sox10 2 Proximal M M 7PM
## M12_S32.S126 Sox10 2 Proximal M M 7PM
## M12_S32.S138 Sox10 2 Proximal M M 7PM
## M12_S32.S167 Sox10 2 Proximal M M 7PM
## Time_Processed
## M12_S32.S120 7PM
## M12_S32.S109 7PM
## M12_S32.S126 7PM
## M12_S32.S138 7PM
## M12_S32.S167 7PM
load DGE matrix
ss2_neur.counts <- readMM(paste0(raw_dir,"DGE/gene_sorted-ss2.neur.matrix.mtx"))
rownames(ss2_neur.counts) <- readLines(paste0(raw_dir,"DGE/ss2.neur.genes.tsv"))
colnames(ss2_neur.counts) <- readLines(paste0(raw_dir,"DGE/ss2.neur.barcodes.tsv"))
ss2_glia.counts <- readMM(paste0(raw_dir,"DGE/gene_sorted-ss2.glia.matrix.mtx"))
rownames(ss2_glia.counts) <- readLines(paste0(raw_dir,"DGE/ss2.glia.genes.tsv"))
colnames(ss2_glia.counts) <- readLines(paste0(raw_dir,"DGE/ss2.glia.barcodes.tsv"))
check the expression
ss2_neur.counts[6:10,1500:1505]
## 5 x 6 sparse Matrix of class "dgTMatrix"
## Atlas_M22_S71.S5 Atlas_M22_S71.S18 Atlas_M22_S71.S30
## 0610009B22Rik . . .
## 0610009D07Rik . 5.01 .
## 0610009L18Rik . . .
## 0610009O20Rik 1.00 83.00 1.00
## 0610010B08Rik 328.18 153.61 30.85
## Atlas_M22_S71.S7 Atlas_M22_S72.S161 Atlas_M22_S72.S139
## 0610009B22Rik 2.00 . .
## 0610009D07Rik 1.00 . .
## 0610009L18Rik . . .
## 0610009O20Rik 13.00 105.0 .
## 0610010B08Rik 105.45 74.7 50.78
ss2_glia.counts[6:10,1400:1405]
## 5 x 6 sparse Matrix of class "dgTMatrix"
## Atlas_M12_S34.S377 Atlas_M12_S34.S336 Atlas_M12_S34.S308
## 0610009B22Rik . . .
## 0610009D07Rik . . 1.57
## 0610009L18Rik . . .
## 0610009O20Rik . . 0.95
## 0610010B08Rik 10.49 399.33 12.43
## Atlas_M12_S34.S314 Atlas_M12_S34.S315 Atlas_M12_S34.S290
## 0610009B22Rik . . 55.00
## 0610009D07Rik . . 619.97
## 0610009L18Rik . . .
## 0610009O20Rik . . .
## 0610010B08Rik 2.51 4.99 1095.62
to be convenient, trying to unify NAME in allmeta and colnames of .counts, need to remove those 'Atlas_',
but get 'length not match' error, then find a few cells with 'Aging_' start instead of 'Atlas_',
and those cells are all from so-called 'old mice' with Age 52/104/105 weeks
check first 6 'Aging_' cells
ss2_neur.counts[6:10,grep("Aging_",colnames(ss2_neur.counts))[1:6]]
## 5 x 6 sparse Matrix of class "dgTMatrix"
## Aging_M16_S45.S52 Aging_M16_S45.S14 Aging_M16_S45.S39
## 0610009B22Rik 1.00 . .
## 0610009D07Rik . . .
## 0610009L18Rik . . .
## 0610009O20Rik . . .
## 0610010B08Rik 360.99 165.77 328.02
## Aging_M16_S45.S83 Aging_M16_S45.S37 Aging_M16_S45.S63
## 0610009B22Rik . . .
## 0610009D07Rik . . .
## 0610009L18Rik . . .
## 0610009O20Rik . . .
## 0610010B08Rik 377.23 67.82 62.18
allmeta_ss2_neur[grep("Aging_",colnames(ss2_neur.counts))[1:6],c("Age","Age_Processed")]
## Age Age_Processed
## M16_S45.S52 52 Old
## M16_S45.S14 52 Old
## M16_S45.S39 52 Old
## M16_S45.S83 52 Old
## M16_S45.S37 52 Old
## M16_S45.S63 52 Old
remove 'Atlas_' or 'Aging_' in front of colnames of .counts
demo
colnames(ss2_neur.counts)[1:5];as.character(sapply(colnames(ss2_neur.counts)[1:5],function(x){gsub("Atlas_|Aging_","",x)}))
## [1] "Atlas_M1_S1.S339" "Atlas_M1_S1.S351" "Atlas_M1_S1.S363" "Atlas_M1_S1.S349"
## [5] "Atlas_M1_S1.S313"
## [1] "M1_S1.S339" "M1_S1.S351" "M1_S1.S363" "M1_S1.S349" "M1_S1.S313"
for all
colnames(ss2_neur.counts) <- as.character(sapply(colnames(ss2_neur.counts),function(x){gsub("Atlas_|Aging_","",x)}))
colnames(ss2_glia.counts) <- as.character(sapply(colnames(ss2_glia.counts),function(x){gsub("Atlas_|Aging_","",x)}))
check
cat("ss2_neur new cell names in allmeta_neur: \n",sum(colnames(ss2_neur.counts) %in% allmeta_ss2_neur$NAME),
"\n\nss2_glia new cell names in allmeta_glia: \n",sum(colnames(ss2_glia.counts) %in% allmeta_ss2_glia$NAME))
## ss2_neur new cell names in allmeta_neur:
## 2657
##
## ss2_glia new cell names in allmeta_glia:
## 3039
actually they have same orders
cat("ss2_neur new cell names = NAME in allmeta_neur: \n",sum(colnames(ss2_neur.counts) == allmeta_ss2_neur$NAME),
"\n\nss2_glia new cell names = NAME in allmeta_glia: \n",sum(colnames(ss2_glia.counts) == allmeta_ss2_glia$NAME))
## ss2_neur new cell names = NAME in allmeta_neur:
## 2657
##
## ss2_glia new cell names = NAME in allmeta_glia:
## 3039
not sure about what's the expression value, TPM or counts? (though named by '.counts' ? the name/process is designed for droplet.)
considering it's RSEM's output which could have 'expected counts' with decimals (normal counts is integer), both could be possible,
and it should be processed for the little difference about nGene
nGene calculated from expression matrix is a little different from nGene in allmeta
guess some very small values of some genes became 0
cat("ss2_neur cells: \n",dim(ss2_neur.counts)[2],
"\n\nss2_neur cells with 'nGene >=1000': \n",sum(colSums(ss2_neur.counts>0) >= 1000))
## ss2_neur cells:
## 2657
##
## ss2_neur cells with 'nGene >=1000':
## 2656
cat("ss2_glia cells: \n",dim(ss2_glia.counts)[2],
"\n\nss2_glia cells with 'nGene >=1000': \n",sum(colSums(ss2_glia.counts>0) >= 1000))
## ss2_glia cells:
## 3039
##
## ss2_glia cells with 'nGene >=1000':
## 3036
and what's confused next is that there's a 'nUMI' column in allmeta_ss2
but as we know Smart-Seq2 technology is not so possible to contain UMI...
cat("total expression value of some ss2_neur cells: \n\n");colSums(ss2_neur.counts[,1500:1505])
## total expression value of some ss2_neur cells:
## M22_S71.S5 M22_S71.S18 M22_S71.S30 M22_S71.S7 M22_S72.S161 M22_S72.S139
## 1210587.6 1053291.2 1069731.4 616457.4 1412823.1 1106437.9
how about their 'nUMI' in meta data
allmeta_ss2_neur[1500:1505,"nUMI",drop=F]
## nUMI
## M22_S71.S5 1210587.56
## M22_S71.S18 1053291.16
## M22_S71.S30 1069731.41
## M22_S71.S7 616457.44
## M22_S72.S161 1412823.05
## M22_S72.S139 1106437.9
cat("total expression value of some ss2_glia cells: \n\n");colSums(ss2_glia.counts[,1400:1405])
## total expression value of some ss2_glia cells:
## M12_S34.S377 M12_S34.S336 M12_S34.S308 M12_S34.S314 M12_S34.S315 M12_S34.S290
## 891648.1 1132662.9 588916.9 400811.4 781963.6 959742.6
how about their 'nUMI' in meta data
allmeta_ss2_glia[1400:1405,"nUMI",drop=F]
## nUMI
## M12_S34.S377 891648.05
## M12_S34.S336 1132662.92
## M12_S34.S308 588916.89
## M12_S34.S314 400811.42
## M12_S34.S315 781963.58
## M12_S34.S290 959742.57
could see that so-called 'nUMI' is indeed sum of expression values, but not the same across samples
although in paper, ss2 always use TPM or TP10K,
here we could just regard the expression value as reads count
and without additional raw data, impossible to turn it into TPM,
so, we could just normalize it to CPM or say CP10k
# norm_f <- 1e6
# have tried 1e6, it got much flatter and longer groups in tsne image after sub-clustering, no other considerations here
norm_f <- 1e4
ss2_neur.counts <- sweep(norm_f*ss2_neur.counts,2,as.numeric(allmeta_ss2_neur$nUMI), FUN='/')
ss2_glia.counts <- sweep(norm_f*ss2_glia.counts,2,as.numeric(allmeta_ss2_glia$nUMI), FUN='/')
# _n as log2(CP10K+1)
ss2_neur.counts_n <- log2(ss2_neur.counts+1)
ss2_glia.counts_n <- log2(ss2_glia.counts+1)
normalize total expression of each cell to 10k
cat("normalized total expression value of some ss2_neur cells: \n\n");colSums(ss2_neur.counts[,1500:1505])
## normalized total expression value of some ss2_neur cells:
## M22_S71.S5 M22_S71.S18 M22_S71.S30 M22_S71.S7 M22_S72.S161 M22_S72.S139
## 10000 10000 10000 10000 10000 10000
cat("normalized total expression value of some ss2_glia cells: \n\n");colSums(ss2_glia.counts[,1400:1405])
## normalized total expression value of some ss2_glia cells:
## M12_S34.S377 M12_S34.S336 M12_S34.S308 M12_S34.S314 M12_S34.S315 M12_S34.S290
## 10000 10000 10000 10000 10000 10000
check marker genes of neuron/glia from Lasrado et al. 2017
Neurons
Tubb3, Elavl4, Ret, Phox2b, Chrnb4, Eml5, Smpd3, Tagln3, Snap25, Gpr22, Gdap1l1, Stmn3,
Chrna3, Scg3, Syt4, Ncan, Crmp1, Adcyap1r1, Elavl3, Dlg2, Cacna2d(Cacna2d1/2/3/4 which ?)
Glia
Erbb3, Sox10, Fabp3, Plp1, Gas7, Nid1, Qk, Sparc, Mest, Nfia, Wwtr1, Gpm6b, Rasa3,
Flrt1, Itpripl1, Itga4, Lama4, Postn, Ptprz1, Pdpn, Col18a1, Nrcam
genes_neur <- c("Tubb3", "Elavl4", "Ret", "Phox2b", "Chrnb4", "Eml5", "Smpd3",
"Tagln3", "Snap25", "Gpr22", "Gdap1l1", "Stmn3", "Chrna3", "Scg3",
"Syt4", "Ncan", "Crmp1", "Adcyap1r1", "Elavl3", "Dlg2","Cacna2d1", "Cacna2d2")
# "Syt4", "Ncan", "Crmp1", "Adcyap1r1", "Elavl3", "Dlg2","Cacna2d1", "Cacna2d2", "Cacna2d3", "Cacna2d4")
genes_glia <- c("Erbb3", "Sox10", "Fabp3", "Plp1", "Gas7", "Nid1", "Qk", "Sparc", "Mest", "Nfia", "Wwtr1",
"Gpm6b", "Rasa3", "Flrt1", "Itpripl1", "Itga4", "Lama4", "Postn", "Ptprz1", "Pdpn", "Col18a1", "Nrcam")
pheatmap::pheatmap(cbind(ss2_neur.counts_n[c(genes_neur,genes_glia),],ss2_glia.counts_n[c(genes_neur,genes_glia),]),
cluster_rows = F,cluster_cols = F, show_colnames = F, fontsize_row = 7.2,
gaps_col = dim(ss2_neur.counts_n)[2], gaps_row = length(genes_neur),
main = "Heatmap: log2(CP10K+1) expression of neur/glia marker genes\nleft: ss2_neur, right: ss2_glia;\n up: genes_neur, down: genes_glia")
# convert to signature score
sig_score_neur <- cbind(ss2_neur.counts_n[c(genes_neur),],ss2_glia.counts_n[c(genes_neur),])
sig_score_glia <- cbind(ss2_neur.counts_n[c(genes_glia),],ss2_glia.counts_n[c(genes_glia),])
sig_score_neur <- t(scale(t(sig_score_neur), center=FALSE))
sig_score_glia <- t(scale(t(sig_score_glia), center=FALSE))
how about scaled value
pheatmap::pheatmap(rbind(sig_score_neur,sig_score_glia),
cluster_rows = F,cluster_cols = F, show_colnames = F, fontsize_row = 7.2,
gaps_col = dim(ss2_neur.counts_n)[2], gaps_row = length(genes_neur),
main = "Heatmap: scaled log2(CP10K+1) expression of neur/glia marker genes\nleft: ss2_neur, right: ss2_glia;\n up: genes_neur, down: genes_glia")
# then take mean
sig_score_neur <- colMeans(sig_score_neur)
sig_score_glia <- colMeans(sig_score_glia)
pheatmap::pheatmap(rbind(sig_score_neur,sig_score_glia),
cluster_rows = F,cluster_cols = F, show_colnames = F,
gaps_col = dim(ss2_neur.counts_n)[2], gaps_row = 1,
main = "Heatmap: signature score of neur/glia marker genes\nleft: ss2_neur, right: ss2_glia;\n up: genes_neur, down: genes_glia")
in general, ss2_neur and _glia are well separated by those marker genes
is there any cell different/opposite ?
cat("ss2_neur cells: \n",dim(ss2_neur.counts)[2],
"\n\nglia-like cells in ss2_neur: \n",sum(sig_score_neur[1:dim(ss2_neur.counts_n)[2]] < sig_score_glia[1:dim(ss2_neur.counts_n)[2]]),
"\n\n\nss2_glia cells: \n",dim(ss2_glia.counts)[2],
"\n\nneur-like cells in ss2_glia: \n",sum(sig_score_neur[(dim(ss2_neur.counts_n)[2]+1):(dim(ss2_neur.counts_n)[2]+dim(ss2_glia.counts_n)[2])] >
sig_score_glia[(dim(ss2_neur.counts_n)[2]+1):(dim(ss2_neur.counts_n)[2]+dim(ss2_glia.counts_n)[2])]))
## ss2_neur cells:
## 2657
##
## glia-like cells in ss2_neur:
## 0
##
##
## ss2_glia cells:
## 3039
##
## neur-like cells in ss2_glia:
## 15
what are those cells
wrsig_cells <- rbind(sig_score_neur,sig_score_glia)[,c(sig_score_neur[1:dim(ss2_neur.counts_n)[2]] < sig_score_glia[1:dim(ss2_neur.counts_n)[2]],
sig_score_neur[(dim(ss2_neur.counts_n)[2]+1):(dim(ss2_neur.counts_n)[2]+dim(ss2_glia.counts_n)[2])] >
sig_score_glia[(dim(ss2_neur.counts_n)[2]+1):(dim(ss2_neur.counts_n)[2]+dim(ss2_glia.counts_n)[2])])]
cat("neur-like ss2_glia :\n\n");t(wrsig_cells)
## neur-like ss2_glia :
## sig_score_neur sig_score_glia
## M10_S25.S287 0.3956897 0.35361891
## M12_S31.S54 0.1935581 0.13908701
## M13_S35.S34 0.8062678 0.75337060
## M17_S52.S143 1.0703043 0.51475028
## M19_S59.S201 1.1289381 0.22542524
## M22_S67.S3 0.6895019 0.14439987
## M22_S67.S74 0.6765199 0.49102940
## M22_S68.S134 1.2611510 0.03127052
## M22_S73.S222 0.9624907 0.07234676
## M22_S73.S251 1.0419958 0.05229566
## M22_S73.S201 0.8180435 0.47197097
## M24_S79.S28 0.8825665 0.04041496
## M24_S80.S130 0.3680673 0.04114909
## M25_S83.S59 0.7483502 0.57859943
## M25_S85.S253 1.2196015 0.32383591
pheatmap::pheatmap(wrsig_cells,
cluster_rows = F,cluster_cols = F, show_colnames = T,
display_numbers = T,
main = "Heatmap: signature score of neur-like ss2_glia")
pheatmap::pheatmap(cbind(ss2_neur.counts_n[c(genes_neur,genes_glia),],ss2_glia.counts_n[c(genes_neur,genes_glia),])[,colnames(wrsig_cells)],
cluster_rows = F,cluster_cols = F, show_colnames = T, fontsize_row = 6.8,
display_numbers = F, gaps_row = length(genes_neur),
main = "Heatmap: log2(CP10K+1) of neur/glia marker genes for neur-like ss2_glia\n up: genes_neur, down: genes_glia")
in 15 neur-like ss2_glia, could see about 6-7 have either very low or very close neur/glia signature score
what caused this may be i.Immune labeling, ii.FACS sorting or even iii.the List of marker genes for neur/glia
here would keep those cells, but make a mark for them
colnames(wrsig_cells)
## [1] "M10_S25.S287" "M12_S31.S54" "M13_S35.S34" "M17_S52.S143" "M19_S59.S201"
## [6] "M22_S67.S3" "M22_S67.S74" "M22_S68.S134" "M22_S73.S222" "M22_S73.S251"
## [11] "M22_S73.S201" "M24_S79.S28" "M24_S80.S130" "M25_S83.S59" "M25_S85.S253"
allmeta add neur/glia sig_score
#
allmeta_ss2 <- cbind(allmeta_ss2,t(rbind(sig_score_neur,sig_score_glia))[rownames(allmeta_ss2),])
allmeta_ss2[,c("sig_score_neur","sig_score_glia")] <- round(allmeta_ss2[,c("sig_score_neur","sig_score_glia")],2)
# repartition neur/glia
allmeta_ss2_neur <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric neurons (Smart-Seq2)"),]
allmeta_ss2_glia <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric glia (Smart-Seq2)"),]
check meta data of neur-like ss2_glia
allmeta_ss2[colnames(wrsig_cells),]
## NAME Annotation Dataset
## M10_S25.S287 M10_S25.S287 Glia_3 Mouse colon enteric glia (Smart-Seq2)
## M12_S31.S54 M12_S31.S54 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M13_S35.S34 M13_S35.S34 Glia_3 Mouse colon enteric glia (Smart-Seq2)
## M17_S52.S143 M17_S52.S143 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M19_S59.S201 M19_S59.S201 Glia_3 Mouse colon enteric glia (Smart-Seq2)
## M22_S67.S3 M22_S67.S3 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S67.S74 M22_S67.S74 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S68.S134 M22_S68.S134 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S73.S222 M22_S73.S222 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S73.S251 M22_S73.S251 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S73.S201 M22_S73.S201 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M24_S79.S28 M24_S79.S28 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M24_S80.S130 M24_S80.S130 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M25_S83.S59 M25_S83.S59 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M25_S85.S253 M25_S85.S253 Glia_2 Mouse colon enteric glia (Smart-Seq2)
## Mouse_ID Unique_ID nGene nUMI Age Age_Processed Model
## M10_S25.S287 M10 S25 5199 1156647.53 12 Young Sox10
## M12_S31.S54 M12 S31 2805 241888.82 12 Young Sox10
## M13_S35.S34 M13 S35 6526 1643994.84 12 Young Sox10
## M17_S52.S143 M17 S52 8069 789865.28 12 Young Uchl1
## M19_S59.S201 M19 S59 4346 613243.6 12 Young Uchl1
## M22_S67.S3 M22 S67 2758 26048.76 12 Young Uchl1
## M22_S67.S74 M22 S67 5529 355083.05 12 Young Uchl1
## M22_S68.S134 M22 S68 4123 723618.93 12 Young Uchl1
## M22_S73.S222 M22 S73 4407 860896.83 12 Young Uchl1
## M22_S73.S251 M22 S73 5105 950884.65 12 Young Uchl1
## M22_S73.S201 M22 S73 6631 705714.29 12 Young Uchl1
## M24_S79.S28 M24 S79 5519 650119.54 12 Young Uchl1
## M24_S80.S130 M24 S80 5599 608444.38 12 Young Uchl1
## M25_S83.S59 M25 S83 7141 816792.66 11 Young Uchl1
## M25_S85.S253 M25 S85 7618 1158638.51 11 Young Uchl1
## Model_Processed Segment Segment_Processed Sex Sex_Processed Time
## M10_S25.S287 Sox10 3 Distal M M 7PM
## M12_S31.S54 Sox10 1 Proximal M M 7PM
## M13_S35.S34 Sox10 1 Proximal M M 7PM
## M17_S52.S143 Uchl1 2 Proximal M M 7AM
## M19_S59.S201 Uchl1 1 Proximal M M 7AM
## M22_S67.S3 Uchl1 1 Proximal M M 7PM
## M22_S67.S74 Uchl1 1 Proximal M M 7PM
## M22_S68.S134 Uchl1 2 Proximal M M 7PM
## M22_S73.S222 Uchl1 3 Distal M M 7PM
## M22_S73.S251 Uchl1 3 Distal M M 7PM
## M22_S73.S201 Uchl1 3 Distal M M 7PM
## M24_S79.S28 Uchl1 1 Proximal F F 7PM
## M24_S80.S130 Uchl1 2 Proximal F F 7PM
## M25_S83.S59 Uchl1 1 Proximal F F 7AM
## M25_S85.S253 Uchl1 2 Proximal F F 7AM
## Time_Processed sig_score_neur sig_score_glia
## M10_S25.S287 7PM 0.40 0.35
## M12_S31.S54 7PM 0.19 0.14
## M13_S35.S34 7PM 0.81 0.75
## M17_S52.S143 7AM 1.07 0.51
## M19_S59.S201 7AM 1.13 0.23
## M22_S67.S3 7PM 0.69 0.14
## M22_S67.S74 7PM 0.68 0.49
## M22_S68.S134 7PM 1.26 0.03
## M22_S73.S222 7PM 0.96 0.07
## M22_S73.S251 7PM 1.04 0.05
## M22_S73.S201 7PM 0.82 0.47
## M24_S79.S28 7PM 0.88 0.04
## M24_S80.S130 7PM 0.37 0.04
## M25_S83.S59 7AM 0.75 0.58
## M25_S85.S253 7AM 1.22 0.32
(detected genes with expression)
plot(as.numeric(allmeta_ss2$nGene), type = "p", pch = 1,ylab="nGene")
abline(h=mean(as.numeric(allmeta_ss2$nGene)),col=6)
abline(h=mean(as.numeric(allmeta_ss2_neur$nGene)),col="green3")
abline(h=mean(as.numeric(allmeta_ss2_glia$nGene)),col="blue3")
title(paste0("mean nGene for ss2 neuron(left,green) and glia(right,blue) "))
neur and glia are naturally partitioned by nGene !
but couldn't do it this way with mixed neur+glia.
h <- hist(as.numeric(allmeta_ss2$nGene), breaks = 20,xlab="", main="", xlim = c(0,12000), ylim = c(0,800), plot = T)
h1 <- hist(as.numeric(allmeta_ss2_neur$nGene), breaks = 20, xlab="", main="", xlim = c(0,12000), ylim = c(0,800), plot = F)
h2 <- hist(as.numeric(allmeta_ss2_glia$nGene), breaks = 20, xlab="", main="", xlim = c(0,12000), ylim = c(0,800), plot = F)
d <- density(as.numeric(allmeta_ss2$nGene))
d1 <- density(as.numeric(allmeta_ss2_neur$nGene))
d2 <- density(as.numeric(allmeta_ss2_glia$nGene))
#plot(h,add=T, col="grey", plot = F)
#abline(v=mean(as.numeric(allmeta_ss2$nGene)),col=6)
title(paste0("nGene distribtusion for ss2 neuron(green) and glia(blue) "))
#lines(x = d$x, y=d$y* length(as.numeric(allmeta_ss2$nGene))* diff(h$breaks)[1],col="grey",lwd=2, xlim = c(0,12000), ylim = c(0,800))
lines(x = d1$x, y=d1$y* length(as.numeric(allmeta_ss2_neur$nGene))* diff(h1$breaks)[1],col="green3",lwd=3, xlim = c(0,12000), ylim = c(0,800))
lines(x = d2$x, y=d2$y* length(as.numeric(allmeta_ss2_glia$nGene))* diff(h2$breaks)[1],col="blue3",lwd=3, xlim = c(0,12000), ylim = c(0,800))
plot(h,add=F, col="grey", xlab="", main="")
title(paste0("nGene distribtusion for mixed ss2 neuron+glia "))
lines(x = d$x, y=d$y* length(as.numeric(allmeta_ss2$nGene))* diff(h$breaks)[1],col=6, lwd=2, xlim = c(0,12000), ylim = c(0,800))
variable genes are important in batch correction and clustering.
in paper, they calculate top 1500 vargenes for each batch, merge and sort by repeating times, take top 1500 again as new vargenes
also need to 'remove HLA, immunoglobulin, ribosomal, and mitochondrial genes based on HUGO gene names', with:
var_regex = '^HLA-|^IG[HJKL]|^RNA|^MT|^RP'
here comes two questions, the first one could have big effect on sub-clustering.
to partition ss2_neur and _glia, these could be not so important,
because differences between neur and glia are big enough to ignore other effects like batch effect;
but to sub-cluster ss2_neur or _glia, batch effect could be larger than putative biological effect,
without correction the result might be first clustered by batch effects then by biological ones in each batch.
could be 'one Mouse' or 'one Sample' or some real batches to do experiment.
guess they might do several mice at once, but we could only try to select first two kinds without additional information.
using "Sample_ID" as batch info, more batches could have very small number of cells,
and would get mixed sub-clustering results comparing to inpaper clusters (allmeta$Annotation), especially ss2_glia.
finally choose the first kind, i.g. one unique 'Mouse_ID' as one batch
if one batch only has one cell, the combat correction step would use 'mean only' method instead of considering mean and variance of cells
if one batch only has very small number of cells, it might also have bad effect on this correction (of which I'm not so sure yet)
in the previous Cell paper of Chris's (example code), they merge all small batches with cells less than 50 as one new batch, but after trying a range of 'min_cells' to merge, I just set a relatively small value to merge as few mice as possible .
just basically use the related gene names:
var_regex='^Cd|^Ig|^Rn|^mt|^Rp'
another set of genes may have effect is cell cycle related (not mentioned in paper),
have tested that. g2m genes enrich in 3-4 ss2_neur sub-clusters,
but without cellcycle genes, only one sub-cluster of PIMNs dispeared,
so here also do clustering without consideration of cellcycle's effect.
using 'Sample_ID' as batch info
cat("profiled ss2_mix in each Mouse-Sample: \n");table(allmeta_ss2$Unique_ID)[paste0("S",1:102)]
## profiled ss2_mix in each Mouse-Sample:
##
## S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 S11 S12 S13 S14 S15 S16
## 24 25 20 74 68 46 67 42 39 52 77 41 32 65 55 49
## S17 S18 S19 S20 S21 S22 S23 S24 S25 S26 S27 S28 S29 S30 S31 S32
## 47 78 69 65 53 68 74 62 73 44 80 74 74 56 50 69
## S33 S34 S35 S36 S37 S38 S39 S40 S41 S42 S43 S44 S45 S46 S47 S48
## 65 58 72 62 68 58 1 1 5 2 3 4 77 80 61 59
## S49 S50 S51 S52 S53 S54 S55 S56 S57 S58 S59 S60 S61 S62 S63 S64
## 37 26 57 65 20 22 77 76 84 82 59 22 56 82 76 63
## S65 S66 S67 S68 S69 S70 S71 S72 S73 S74 S75 S76 S77 S78 S79 S80
## 41 46 78 81 86 82 86 77 81 87 78 68 73 87 83 79
## S81 S82 S83 S84 S85 S86 S87 S88 S89 S90 S91 S92 S93 S94 S95 S96
## 77 79 72 71 74 82 17 51 10 2 55 63 26 38 28 61
## S97 S98 S99 S100 S101 S102
## 44 39 63 61 38 40
cat("\n quantile statistics: \n");quantile(table(allmeta_ss2$Unique_ID),probs=seq(0,1,0.05))
##
## quantile statistics:
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60%
## 1.00 4.05 20.20 26.00 38.00 41.00 46.00 51.35 56.00 59.00 62.00 65.00 68.00
## 65% 70% 75% 80% 85% 90% 95% 100%
## 70.30 73.70 75.50 77.00 78.85 81.00 82.95 87.00
actually one sample is more like one batch as samples have more average cell number,
another choice might could be to merge neighboring mouse/sample with small number of cells
using 'Mouse_ID' as batch info
cat("profiled ss2_mix in each Mouse: \n");table(allmeta_ss2$Mouse_ID)[paste0("M",1:29)]
## profiled ss2_mix in each Mouse:
##
## M1 M2 M3 M4 M5 M6 M7 M8 M9 M10 M11 M12 M13 M14 M15 M16 M17 M18 M19 M20
## 24 25 20 74 68 46 415 229 255 253 284 242 260 2 14 340 164 319 81 138
## M21 M22 M23 M24 M25 M26 M27 M28 M29
## 226 658 306 318 299 80 182 172 202
cat("\n quantile statistics: \n");quantile(table(allmeta_ss2$Mouse_ID),probs=seq(0,1,0.05))
##
## quantile statistics:
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60%
## 2.0 16.4 23.2 29.2 59.2 74.0 80.4 126.6 165.6 178.0 202.0 227.2 239.4
## 65% 70% 75% 80% 85% 90% 95% 100%
## 253.4 258.0 284.0 301.8 315.6 323.2 385.0 658.0
check min_cells to choose to merge batches
min_cells <- 20
cat("ss2_mix mice: ",length(table(allmeta_ss2$Mouse_ID)),
"\nss2_mix count: ",sum(table(allmeta_ss2$Mouse_ID)),
paste0("\n\nmin_cell < ",min_cells," : ",
"\n mice ",sum(table(allmeta_ss2$Mouse_ID) < min_cells),
"\n count ",sum((table(allmeta_ss2$Mouse_ID))[table(allmeta_ss2$Mouse_ID) < min_cells])))
## ss2_mix mice: 29
## ss2_mix count: 5696
##
## min_cell < 20 :
## mice 2
## count 16
modified parameters for Phenograph in paper:
PCs 1-16, 'k'NN 250
#ss2_mix.counts <- RowMergeSparseMatrices(ss2_neur.counts, ss2_glia.counts)
#
batch_use.ss2_mix <- as.character(allmeta_ss2$Mouse_ID)
names(batch_use.ss2_mix) <- rownames(allmeta_ss2)
batch_use.ss2_mix <- factor(batch_use.ss2_mix)
#ss2_mix.seur <- run_analysis(name="ss2_mix",
# counts=ss2_mix.counts,
# do.batch=TRUE,
# batch.use=batch_use.ss2_mix,
# min_cells=20,
# var_regex='^Cd|^Ig|^Rn|^mt|^Rp',
# num_pcs=30,
# clust_pcs=16,
# k=250)
#
#ss2_mix.seur@meta.data <- cbind(ss2_mix.seur@meta.data, allmeta_ss2[rownames(ss2_mix.seur@meta.data),])
#saveRDS(ss2_mix.seur,"ss2_mix.seur.rds")
because this kind of step takes time and everytime to run would get a little different result,
here just use same parameters to run once then annotate the code, next always use the saved seurat object for analysis
ss2_mix.seur <- readRDS("ss2_mix.seur.rds")
head(ss2_mix.seur@meta.data)
## orig.ident nCount_RNA nFeature_RNA phenograph.k250 NAME
## M1_S1.S339 ss2_mix 10000.000 7060 3 M1_S1.S339
## M1_S1.S351 ss2_mix 10000.000 7418 3 M1_S1.S351
## M1_S1.S363 ss2_mix 9999.865 7672 3 M1_S1.S363
## M1_S1.S349 ss2_mix 9999.957 8016 3 M1_S1.S349
## M1_S1.S313 ss2_mix 9999.972 6983 3 M1_S1.S313
## M1_S1.S289 ss2_mix 10000.000 7945 5 M1_S1.S289
## Annotation Dataset Mouse_ID
## M1_S1.S339 PIMN_4 Mouse colon enteric neurons (Smart-Seq2) M1
## M1_S1.S351 PIMN_4 Mouse colon enteric neurons (Smart-Seq2) M1
## M1_S1.S363 PIMN_4 Mouse colon enteric neurons (Smart-Seq2) M1
## M1_S1.S349 PIMN_5 Mouse colon enteric neurons (Smart-Seq2) M1
## M1_S1.S313 PIMN_4 Mouse colon enteric neurons (Smart-Seq2) M1
## M1_S1.S289 PIN_1 Mouse colon enteric neurons (Smart-Seq2) M1
## Unique_ID nGene nUMI Age Age_Processed Model Model_Processed
## M1_S1.S339 S1 7060 1777551.53 12 Young Sox10 Sox10
## M1_S1.S351 S1 7419 1435340.23 12 Young Sox10 Sox10
## M1_S1.S363 S1 7674 1483155.55 12 Young Sox10 Sox10
## M1_S1.S349 S1 8018 1505426.6 12 Young Sox10 Sox10
## M1_S1.S313 S1 6985 1412616.17 12 Young Sox10 Sox10
## M1_S1.S289 S1 7945 1210014.21 12 Young Sox10 Sox10
## Segment Segment_Processed Sex Sex_Processed Time Time_Processed
## M1_S1.S339 All <NA> F F <NA> <NA>
## M1_S1.S351 All <NA> F F <NA> <NA>
## M1_S1.S363 All <NA> F F <NA> <NA>
## M1_S1.S349 All <NA> F F <NA> <NA>
## M1_S1.S313 All <NA> F F <NA> <NA>
## M1_S1.S289 All <NA> F F <NA> <NA>
## sig_score_neur sig_score_glia inlocal inpaper
## M1_S1.S339 0.55 0.02 ss2_neur ss2_neur
## M1_S1.S351 0.99 0.07 ss2_neur ss2_neur
## M1_S1.S363 0.98 0.13 ss2_neur ss2_neur
## M1_S1.S349 0.97 0.04 ss2_neur ss2_neur
## M1_S1.S313 0.79 0.19 ss2_neur ss2_neur
## M1_S1.S289 0.88 0.03 ss2_neur ss2_neur
ElbowPlot(ss2_mix.seur,ndims = 30) + labs(title = "PCs range 1-30, take 1-16")
PC1&2
DimPlot(ss2_mix.seur, reduction = 'pca', dims = c(1,2)) + labs(title = "ss2_mix PC1+2 ")
DimPlot(ss2_mix.seur, reduction = 'pca', dims = c(1,2), group.by = 'Dataset') + labs(title = "ss2_mix PC1+2 with ss2_neur/glia labeling ")
DimPlot(ss2_mix.seur, reduction = 'tsne') + labs(title = "ss2_mix tSNE")
DimPlot(ss2_mix.seur, reduction = 'tsne', group.by = 'Dataset' ) + labs(title = "ss2_mix tSNE with ss2_neur/glia labeling")
Phenograph clusters
DimPlot(ss2_mix.seur, group.by = 'phenograph.k250', label = T) + labs(title = "ss2_mix Phenograph output")
check neur/glia sig_score of each cluster
pheno_clust <- ss2_mix.seur@meta.data[c("phenograph.k250","sig_score_neur","sig_score_glia","Dataset")]
pheno_clust <- group_by(pheno_clust, phenograph.k250)
pheno_clust <- summarise(pheno_clust,
count = n(),
neur_labeled = sum(Dataset=="Mouse colon enteric neurons (Smart-Seq2)"),
glia_labeled = sum(Dataset=="Mouse colon enteric glia (Smart-Seq2)"),
max_sig_neur = max(sig_score_neur),
max_sig_glia = max(sig_score_glia),
inlocal = if(max_sig_neur > max_sig_glia){"neuron"}else{"glia"})
data.frame(pheno_clust)
## phenograph.k250 count neur_labeled glia_labeled max_sig_neur max_sig_glia
## 1 0 1721 23 1698 1.65 1.80
## 2 1 718 0 718 0.88 1.99
## 3 2 637 17 620 1.79 1.81
## 4 3 632 632 0 1.83 0.79
## 5 4 628 628 0 1.96 0.74
## 6 5 584 584 0 1.93 0.85
## 7 6 408 407 1 1.89 0.86
## 8 7 368 366 2 2.02 0.63
## inlocal
## 1 glia
## 2 glia
## 3 glia
## 4 neuron
## 5 neuron
## 6 neuron
## 7 neuron
## 8 neuron
here we also get a few 'neur-like glia' or 'glia-like neur'
(I used to use "Sample_ID" as batch, do 'mean only' correction, and take top 1500 variable genes from whole dataset,
then only get 9 neur-like which all appear in 'marker genes' part upstairs different from labelings,
but these unmodified parameters failed in sub-clustering, so here just stay the same with next step)
('taking top 1500 vargenes from whole' and 'top 1500 from each batch, merging and taking sorted new top 1500 as vargenes' would have less than half genes overlap,
depending on 'Mouse or Sample to choose as one batch' and 'neur/glia or mixed')
# new batch info
var_c_mix <- as.factor(batch_use.ss2_mix)
levels(var_c_mix)[table(var_c_mix) < min_cells] <- "merge"
# top 1500 from whole once
ss2_mix_var <- get_var_genes(ss2_mix.seur@assays[['RNA']]@data, samples=ss2_mix.seur$orig.ident, num_genes = 1500,min_cells = 20)
# top 1500 from each, merge, then new top 1500 from sorted by repeating times
ss2_mix_var_c <- get_var_genes(ss2_mix.seur@assays[['RNA']]@data, samples = var_c_mix, num_genes = 1500,min_cells = 20)
cat("ss2_mix vargenes: 1500\noverlap of 'sorted merged from each batch' and 'from whole': ",
sum(ss2_mix_var %in% ss2_mix_var_c),
"\nspecific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp): ",length(grep('^Cd|^Ig|^Rn|^mt|^Rp', ss2_mix_var_c)))
## ss2_mix vargenes: 1500
## overlap of 'sorted merged from each batch' and 'from whole': 752
## specific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp): 38
data.frame(pheno_clust)
## phenograph.k250 count neur_labeled glia_labeled max_sig_neur max_sig_glia
## 1 0 1721 23 1698 1.65 1.80
## 2 1 718 0 718 0.88 1.99
## 3 2 637 17 620 1.79 1.81
## 4 3 632 632 0 1.83 0.79
## 5 4 628 628 0 1.96 0.74
## 6 5 584 584 0 1.93 0.85
## 7 6 408 407 1 1.89 0.86
## 8 7 368 366 2 2.02 0.63
## inlocal
## 1 glia
## 2 glia
## 3 glia
## 4 neuron
## 5 neuron
## 6 neuron
## 7 neuron
## 8 neuron
could see pheno_clust '0' and '2' may be non-ideal to use max sig_score as a standard to decide glia/neuron,
though the final 'inlocal' cluster is 'right'
neur vs glia
1.65 vs 1.80
1.79 vs 1.81
check those cells different from neur/glia labelings
highest neur scores (1.65, 1.79) in glia are indeed from them
ss2_mix.seur@meta.data %>% filter(phenograph.k250 %in% c("0","2") & Dataset == "Mouse colon enteric neurons (Smart-Seq2)")
## orig.ident nCount_RNA nFeature_RNA phenograph.k250 NAME
## M17_S52.S162 ss2_mix 10000.000 7199 0 M17_S52.S162
## M17_S52.S189 ss2_mix 9999.986 4253 0 M17_S52.S189
## M19_S60.S294 ss2_mix 9999.819 11523 2 M19_S60.S294
## M22_S67.S64 ss2_mix 10000.000 5677 0 M22_S67.S64
## M22_S67.S68 ss2_mix 9999.966 5633 2 M22_S67.S68
## M22_S67.S72 ss2_mix 9999.848 7342 0 M22_S67.S72
## M22_S68.S184 ss2_mix 10000.000 8370 0 M22_S68.S184
## M22_S68.S113 ss2_mix 10000.000 8608 0 M22_S68.S113
## M22_S70.S314 ss2_mix 9999.981 4537 2 M22_S70.S314
## M22_S70.S351 ss2_mix 10000.000 3806 2 M22_S70.S351
## M22_S70.S308 ss2_mix 9999.839 5553 0 M22_S70.S308
## M22_S71.S74 ss2_mix 9999.975 5579 2 M22_S71.S74
## M22_S71.S25 ss2_mix 9993.599 6246 0 M22_S71.S25
## M22_S71.S63 ss2_mix 9999.894 7596 0 M22_S71.S63
## M22_S72.S164 ss2_mix 10000.000 5235 2 M22_S72.S164
## M22_S72.S145 ss2_mix 10000.000 2741 2 M22_S72.S145
## M22_S73.S257 ss2_mix 9999.764 5945 0 M22_S73.S257
## M22_S73.S193 ss2_mix 10000.000 4954 0 M22_S73.S193
## M22_S74.S306 ss2_mix 10000.000 3120 2 M22_S74.S306
## M24_S79.S74 ss2_mix 10000.000 6703 0 M24_S79.S74
## M24_S80.S181 ss2_mix 10000.000 6094 0 M24_S80.S181
## M24_S80.S136 ss2_mix 10000.000 6686 0 M24_S80.S136
## M24_S80.S125 ss2_mix 10000.000 5053 0 M24_S80.S125
## M24_S80.S106 ss2_mix 9999.140 6707 0 M24_S80.S106
## M24_S80.S105 ss2_mix 10000.000 3031 0 M24_S80.S105
## M24_S80.S103 ss2_mix 10000.000 4786 0 M24_S80.S103
## M24_S80.S172 ss2_mix 9999.773 6151 0 M24_S80.S172
## M24_S80.S113 ss2_mix 9999.984 5009 0 M24_S80.S113
## M24_S80.S151 ss2_mix 10000.000 7006 2 M24_S80.S151
## M24_S81.S211 ss2_mix 9999.964 5268 2 M24_S81.S211
## M24_S82.S380 ss2_mix 10000.000 5795 2 M24_S82.S380
## M24_S82.S298 ss2_mix 10000.000 7534 0 M24_S82.S298
## M24_S82.S305 ss2_mix 10000.000 4622 2 M24_S82.S305
## M24_S82.S329 ss2_mix 10000.000 6585 2 M24_S82.S329
## M25_S83.S51 ss2_mix 9986.817 7383 2 M25_S83.S51
## M25_S83.S1 ss2_mix 9999.443 7337 2 M25_S83.S1
## M25_S83.S6 ss2_mix 9999.978 7874 2 M25_S83.S6
## M25_S84.S163 ss2_mix 10000.000 4423 2 M25_S84.S163
## M25_S84.S97 ss2_mix 9999.288 4948 0 M25_S84.S97
## M25_S85.S255 ss2_mix 9999.120 9038 0 M25_S85.S255
## Annotation Dataset Mouse_ID
## M17_S52.S162 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M17
## M17_S52.S189 PIMN_5 Mouse colon enteric neurons (Smart-Seq2) M17
## M19_S60.S294 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M19
## M22_S67.S64 PIMN_6 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S67.S68 PSVN_2 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S67.S72 PIMN_7 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S68.S184 PIMN_7 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S68.S113 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S70.S314 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S70.S351 PIMN_3 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S70.S308 PIMN_5 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S71.S74 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S71.S25 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S71.S63 PIMN_1 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S72.S164 PIMN_1 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S72.S145 PSN_1 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S73.S257 PIMN_3 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S73.S193 PIMN_4 Mouse colon enteric neurons (Smart-Seq2) M22
## M22_S74.S306 PIMN_1 Mouse colon enteric neurons (Smart-Seq2) M22
## M24_S79.S74 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S80.S181 PIMN_5 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S80.S136 PIMN_5 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S80.S125 PIMN_5 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S80.S106 PIMN_6 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S80.S105 PIMN_6 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S80.S103 PIMN_5 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S80.S172 PIMN_5 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S80.S113 PIMN_2 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S80.S151 PEMN_1 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S81.S211 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S82.S380 PIMN_3 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S82.S298 PIMN_3 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S82.S305 PIMN_3 Mouse colon enteric neurons (Smart-Seq2) M24
## M24_S82.S329 PIMN_1 Mouse colon enteric neurons (Smart-Seq2) M24
## M25_S83.S51 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M25
## M25_S83.S1 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M25
## M25_S83.S6 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M25
## M25_S84.S163 PSN_1 Mouse colon enteric neurons (Smart-Seq2) M25
## M25_S84.S97 PSVN_1 Mouse colon enteric neurons (Smart-Seq2) M25
## M25_S85.S255 PIMN_1 Mouse colon enteric neurons (Smart-Seq2) M25
## Unique_ID nGene nUMI Age Age_Processed Model Model_Processed
## M17_S52.S162 S52 7199 758737.96 12 Young Uchl1 Uchl1
## M17_S52.S189 S52 4254 689184.7 12 Young Uchl1 Uchl1
## M19_S60.S294 S60 11526 1818436.95 12 Young Uchl1 Uchl1
## M22_S67.S64 S67 5677 274866.18 12 Young Uchl1 Uchl1
## M22_S67.S68 S67 5634 292735.49 12 Young Uchl1 Uchl1
## M22_S67.S72 S67 7343 263703.56 12 Young Uchl1 Uchl1
## M22_S68.S184 S68 8370 962725.77 12 Young Uchl1 Uchl1
## M22_S68.S113 S68 8608 693794.32 12 Young Uchl1 Uchl1
## M22_S70.S314 S70 4538 528050.12 12 Young Uchl1 Uchl1
## M22_S70.S351 S70 3806 342232.35 12 Young Uchl1 Uchl1
## M22_S70.S308 S70 5555 807938.5 12 Young Uchl1 Uchl1
## M22_S71.S74 S71 5580 643098 12 Young Uchl1 Uchl1
## M22_S71.S25 S71 6249 1010790.31 12 Young Uchl1 Uchl1
## M22_S71.S63 S71 7598 751938.2 12 Young Uchl1 Uchl1
## M22_S72.S164 S72 5236 954096.53 12 Young Uchl1 Uchl1
## M22_S72.S145 S72 2741 557430.18 12 Young Uchl1 Uchl1
## M22_S73.S257 S73 5947 591157.85 12 Young Uchl1 Uchl1
## M22_S73.S193 S73 4954 975702.49 12 Young Uchl1 Uchl1
## M22_S74.S306 S74 3120 295490.49 12 Young Uchl1 Uchl1
## M24_S79.S74 S79 6703 637414.81 12 Young Uchl1 Uchl1
## M24_S80.S181 S80 6094 570961.05 12 Young Uchl1 Uchl1
## M24_S80.S136 S80 6686 839451.89 12 Young Uchl1 Uchl1
## M24_S80.S125 S80 5053 507444.64 12 Young Uchl1 Uchl1
## M24_S80.S106 S80 6710 697658.82 12 Young Uchl1 Uchl1
## M24_S80.S105 S80 3031 26887.15 12 Young Uchl1 Uchl1
## M24_S80.S103 S80 4786 172529.5 12 Young Uchl1 Uchl1
## M24_S80.S172 S80 6153 703297.68 12 Young Uchl1 Uchl1
## M24_S80.S113 S80 5010 487671.81 12 Young Uchl1 Uchl1
## M24_S80.S151 S80 7006 791904 12 Young Uchl1 Uchl1
## M24_S81.S211 S81 5269 557690.12 12 Young Uchl1 Uchl1
## M24_S82.S380 S82 5795 786853.47 12 Young Uchl1 Uchl1
## M24_S82.S298 S82 7534 835317.43 12 Young Uchl1 Uchl1
## M24_S82.S305 S82 4622 438829.78 12 Young Uchl1 Uchl1
## M24_S82.S329 S82 6585 715351.04 12 Young Uchl1 Uchl1
## M25_S83.S51 S83 7390 921148.6 11 Young Uchl1 Uchl1
## M25_S83.S1 S83 7340 699579.13 11 Young Uchl1 Uchl1
## M25_S83.S6 S83 7875 908440.68 11 Young Uchl1 Uchl1
## M25_S84.S163 S84 4423 784851.73 12 Young Uchl1 Uchl1
## M25_S84.S97 S84 4951 393257.21 12 Young Uchl1 Uchl1
## M25_S85.S255 S85 9039 932142.48 11 Young Uchl1 Uchl1
## Segment Segment_Processed Sex Sex_Processed Time Time_Processed
## M17_S52.S162 2 Proximal M M 7AM 7AM
## M17_S52.S189 2 Proximal M M 7AM 7AM
## M19_S60.S294 4 Distal M M 7AM 7AM
## M22_S67.S64 1 Proximal M M 7PM 7PM
## M22_S67.S68 1 Proximal M M 7PM 7PM
## M22_S67.S72 1 Proximal M M 7PM 7PM
## M22_S68.S184 2 Proximal M M 7PM 7PM
## M22_S68.S113 2 Proximal M M 7PM 7PM
## M22_S70.S314 4 Distal M M 7PM 7PM
## M22_S70.S351 4 Distal M M 7PM 7PM
## M22_S70.S308 4 Distal M M 7PM 7PM
## M22_S71.S74 1 Proximal M M 7PM 7PM
## M22_S71.S25 1 Proximal M M 7PM 7PM
## M22_S71.S63 1 Proximal M M 7PM 7PM
## M22_S72.S164 2 Proximal M M 7PM 7PM
## M22_S72.S145 2 Proximal M M 7PM 7PM
## M22_S73.S257 3 Distal M M 7PM 7PM
## M22_S73.S193 3 Distal M M 7PM 7PM
## M22_S74.S306 4 Distal M M 7PM 7PM
## M24_S79.S74 1 Proximal F F 7PM 7PM
## M24_S80.S181 2 Proximal F F 7PM 7PM
## M24_S80.S136 2 Proximal F F 7PM 7PM
## M24_S80.S125 2 Proximal F F 7PM 7PM
## M24_S80.S106 2 Proximal F F 7PM 7PM
## M24_S80.S105 2 Proximal F F 7PM 7PM
## M24_S80.S103 2 Proximal F F 7PM 7PM
## M24_S80.S172 2 Proximal F F 7PM 7PM
## M24_S80.S113 2 Proximal F F 7PM 7PM
## M24_S80.S151 2 Proximal F F 7PM 7PM
## M24_S81.S211 3 Distal F F 7PM 7PM
## M24_S82.S380 4 Distal F F 7PM 7PM
## M24_S82.S298 4 Distal F F 7PM 7PM
## M24_S82.S305 4 Distal F F 7PM 7PM
## M24_S82.S329 4 Distal F F 7PM 7PM
## M25_S83.S51 1 Proximal F F 7AM 7AM
## M25_S83.S1 1 Proximal F F 7AM 7AM
## M25_S83.S6 1 Proximal F F 7AM 7AM
## M25_S84.S163 2 Proximal F F 7AM 7AM
## M25_S84.S97 2 Proximal F F 7AM 7AM
## M25_S85.S255 2 Proximal F F 7AM 7AM
## sig_score_neur sig_score_glia inlocal inpaper
## M17_S52.S162 0.76 0.27 ss2_glia ss2_neur
## M17_S52.S189 0.75 0.34 ss2_glia ss2_neur
## M19_S60.S294 1.79 0.22 ss2_glia ss2_neur
## M22_S67.S64 0.97 0.04 ss2_glia ss2_neur
## M22_S67.S68 0.69 0.09 ss2_glia ss2_neur
## M22_S67.S72 1.65 0.17 ss2_glia ss2_neur
## M22_S68.S184 1.30 0.17 ss2_glia ss2_neur
## M22_S68.S113 1.02 0.29 ss2_glia ss2_neur
## M22_S70.S314 1.01 0.14 ss2_glia ss2_neur
## M22_S70.S351 1.52 0.07 ss2_glia ss2_neur
## M22_S70.S308 0.84 0.02 ss2_glia ss2_neur
## M22_S71.S74 0.79 0.01 ss2_glia ss2_neur
## M22_S71.S25 0.74 0.20 ss2_glia ss2_neur
## M22_S71.S63 1.04 0.10 ss2_glia ss2_neur
## M22_S72.S164 1.33 0.27 ss2_glia ss2_neur
## M22_S72.S145 0.86 0.00 ss2_glia ss2_neur
## M22_S73.S257 1.25 0.04 ss2_glia ss2_neur
## M22_S73.S193 1.01 0.17 ss2_glia ss2_neur
## M22_S74.S306 1.11 0.00 ss2_glia ss2_neur
## M24_S79.S74 1.02 0.18 ss2_glia ss2_neur
## M24_S80.S181 0.61 0.17 ss2_glia ss2_neur
## M24_S80.S136 0.54 0.06 ss2_glia ss2_neur
## M24_S80.S125 0.91 0.03 ss2_glia ss2_neur
## M24_S80.S106 0.51 0.06 ss2_glia ss2_neur
## M24_S80.S105 0.85 0.16 ss2_glia ss2_neur
## M24_S80.S103 0.56 0.01 ss2_glia ss2_neur
## M24_S80.S172 0.64 0.09 ss2_glia ss2_neur
## M24_S80.S113 0.89 0.27 ss2_glia ss2_neur
## M24_S80.S151 0.80 0.03 ss2_glia ss2_neur
## M24_S81.S211 0.38 0.04 ss2_glia ss2_neur
## M24_S82.S380 1.63 0.11 ss2_glia ss2_neur
## M24_S82.S298 1.64 0.14 ss2_glia ss2_neur
## M24_S82.S305 1.26 0.03 ss2_glia ss2_neur
## M24_S82.S329 1.75 0.24 ss2_glia ss2_neur
## M25_S83.S51 0.89 0.17 ss2_glia ss2_neur
## M25_S83.S1 0.92 0.21 ss2_glia ss2_neur
## M25_S83.S6 1.16 0.13 ss2_glia ss2_neur
## M25_S84.S163 1.40 0.11 ss2_glia ss2_neur
## M25_S84.S97 0.82 0.07 ss2_glia ss2_neur
## M25_S85.S255 1.51 0.21 ss2_glia ss2_neur
ss2_mix.seur@meta.data %>% filter(phenograph.k250 %in% c("6","7") & Dataset == "Mouse colon enteric glia (Smart-Seq2)")
## orig.ident nCount_RNA nFeature_RNA phenograph.k250 NAME
## M8_S16.S132 ss2_mix 10000.000 6315 6 M8_S16.S132
## M22_S73.S222 ss2_mix 9999.988 4406 7 M22_S73.S222
## M28_S96.S169 ss2_mix 10000.000 5196 7 M28_S96.S169
## Annotation Dataset Mouse_ID
## M8_S16.S132 Glia_2 Mouse colon enteric glia (Smart-Seq2) M8
## M22_S73.S222 Glia_2 Mouse colon enteric glia (Smart-Seq2) M22
## M28_S96.S169 Glia_2 Mouse colon enteric glia (Smart-Seq2) M28
## Unique_ID nGene nUMI Age Age_Processed Model Model_Processed
## M8_S16.S132 S16 6315 1038469.5 12 Young Sox10 Sox10
## M22_S73.S222 S73 4407 860896.83 12 Young Uchl1 Uchl1
## M28_S96.S169 S96 5196 732008.35 105 Old Sox10 Sox10
## Segment Segment_Processed Sex Sex_Processed Time Time_Processed
## M8_S16.S132 2 Proximal F F 7PM 7PM
## M22_S73.S222 3 Distal M M 7PM 7PM
## M28_S96.S169 2 Proximal M M 7AM 7AM
## sig_score_neur sig_score_glia inlocal inpaper
## M8_S16.S132 0.28 0.73 ss2_neur ss2_glia
## M22_S73.S222 0.96 0.07 ss2_neur ss2_glia
## M28_S96.S169 0.31 0.63 ss2_neur ss2_glia
except a small part have low or close neur/glia score, not sure about why others get into a cluster which is different from its labeling.
here just make a mark for them, sub-clustering would still use initially labeled ss2_neur/glia as input instead of this one.
inlocal final result for mixed ss2_neur/glia
ss2_mix.seur@meta.data$inlocal <- ss2_mix.seur@meta.data$phenograph.k250
for(i in 1:length(ss2_mix.seur@meta.data$inlocal)){
if(ss2_mix.seur@meta.data$inlocal[i] %in% c("0","1","2")){
ss2_mix.seur@meta.data$inlocal[i] <- "ss2_glia"
}else{
ss2_mix.seur@meta.data$inlocal[i] <- "ss2_neur"
}
}
DimPlot(ss2_mix.seur, group.by = 'inlocal') + labs(title = "ss2_mix inlocal")
inpaper's
ss2_mix.seur@meta.data$inpaper <- ss2_mix.seur@meta.data$Dataset
for(i in 1:length(ss2_mix.seur@meta.data$inpaper)){
if(ss2_mix.seur@meta.data$inpaper[i] %in% c("Mouse colon enteric glia (Smart-Seq2)")){
ss2_mix.seur@meta.data$inpaper[i] <- "ss2_glia"
}else{
ss2_mix.seur@meta.data$inpaper[i] <- "ss2_neur"
}
}
DimPlot(ss2_mix.seur, group.by = 'inpaper') + labs(title = "ss2_mix inpaper")
check main division marker genes of neur
three pairs, all devide neur into two parts,
could see Chat-Nos1 works well for ss2_neur uniquely, the other two pairs have more or less expression on glia
FeaturePlot(ss2_mix.seur, features = c("Chat","Nos1","Gfra2","Gfra1","Casz1","Etv1"))
check given marker genes of glia,
could see these three genes faintly devide glia into two or three parts from top-right to down-left,
but not so clear comparing to neur
FeaturePlot(ss2_mix.seur, features = c("Gfra2","Slc18a2","Ntsr1"))
#saveRDS(ss2_mix.seur,"ss2_mix.seur.rds")
#rm(ss2_mix.seur,ss2_mix.counts)
check how many min_cells to choose to merge batch
cat("profiled ss2_neur in each Mouse: \n");table(allmeta_ss2_neur$Mouse_ID)[paste0("M",1:29)]
## profiled ss2_neur in each Mouse:
##
## M1 M2 M3 M4 M5 M6 M7 M8 M9 M10 M11 M12 M13 M14 M15 M16 M17 M18 M19 M20
## 10 4 4 23 14 15 103 56 71 60 76 56 71 1 9 73 162 119 80 26
## M21 M22 M23 M24 M25 M26 M27 M28 M29
## 61 651 104 316 296 21 51 66 58
cat("\n quantile statistics: \n");quantile(table(allmeta_ss2_neur$Mouse_ID),probs=seq(0,1,0.05))
##
## quantile statistics:
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60%
## 1.0 4.0 8.0 10.8 14.6 21.0 24.2 46.0 56.0 57.2 60.0 63.0 70.0
## 65% 70% 75% 80% 85% 90% 95% 100%
## 71.4 74.8 80.0 103.4 116.0 188.8 308.0 651.0
min_cells <- 5
cat("ss2_neur mice: ",length(table(allmeta_ss2_neur$Mouse_ID)),
"\nss2_neur count: ",sum(table(allmeta_ss2_neur$Mouse_ID)),
paste0("\nmin_cell < ",min_cells," : "),
"\n mice ",sum(table(allmeta_ss2_neur$Mouse_ID) < min_cells),
"\n count ",sum((table(allmeta_ss2_neur$Mouse_ID))[table(allmeta_ss2_neur$Mouse_ID) < min_cells]))
## ss2_neur mice: 29
## ss2_neur count: 2657
## min_cell < 5 :
## mice 3
## count 9
cat("profiled ss2_glia in each Mouse: \n");table(allmeta_ss2_glia$Mouse_ID)[paste0("M",1:29)]
## profiled ss2_glia in each Mouse:
##
## M1 M2 M3 M4 M5 M6 M7 M8 M9 M10 M11 M12 M13 M14 M15 M16 M17 M18 M19 M20
## 14 21 16 51 54 31 312 173 184 193 208 186 189 1 5 267 2 200 1 112
## M21 M22 M23 M24 M25 M26 M27 M28 M29
## 165 7 202 2 3 59 131 106 144
cat("\n quantile statistics: \n");quantile(table(allmeta_ss2_glia$Mouse_ID),probs=seq(0,1,0.05))
##
## quantile statistics:
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60%
## 1.0 1.4 2.0 3.4 6.2 14.0 18.0 29.0 51.6 57.0 106.0 119.6 141.4
## 65% 70% 75% 80% 85% 90% 95% 100%
## 166.6 179.6 186.0 190.6 198.6 203.2 243.4 312.0
min_cells <- 5
cat("ss2_glia mice: ",length(table(allmeta_ss2_glia$Mouse_ID)),
"\nss2_glia count: ",sum(table(allmeta_ss2_glia$Mouse_ID)),
paste0("\nmin_cell < ",min_cells," : "),
"\n mice ",sum(table(allmeta_ss2_glia$Mouse_ID) < min_cells),
"\n count ",sum((table(allmeta_ss2_glia$Mouse_ID))[table(allmeta_ss2_glia$Mouse_ID) < min_cells]))
## ss2_glia mice: 29
## ss2_glia count: 3039
## min_cell < 5 :
## mice 5
## count 9
modified parameters for Phenograph in paper:
ss2_neur PCs 30 1-15, 'k'NN 15
ss2_glia PCs 15 1-9, 'k'NN 500
# ss2_neur
batch_use.ss2_neur <- as.character(allmeta_ss2_neur$Mouse_ID)
names(batch_use.ss2_neur) <- rownames(allmeta_ss2_neur)
batch_use.ss2_neur <- factor(batch_use.ss2_neur)
#ss2_neur.seur <- run_analysis(name="ss2_neur",
# counts=ss2_neur.counts,
# do.batch=TRUE,
# batch.use=batch_use.ss2_neur,
# min_cells=5,
# var_regex='^Cd|^Ig|^Rn|^mt|^Rp',
# num_pcs=30,
# clust_pcs=15,
# k=15)
#ss2_neur.seur@meta.data <- cbind(ss2_neur.seur@meta.data, allmeta_ss2_neur[rownames(ss2_neur.seur@meta.data),])
# ss2_glia
batch_use.ss2_glia <- as.character(allmeta_ss2_glia$Mouse_ID)
names(batch_use.ss2_glia) <- rownames(allmeta_ss2_glia)
batch_use.ss2_glia <- factor(batch_use.ss2_glia)
#ss2_glia.seur <- run_analysis(name="ss2_glia",
# counts=ss2_glia.counts,
# do.batch=TRUE,
# batch.use=batch_use.ss2_glia,
# min_cells=5,
# var_regex='^Cd|^Ig|^Rn|^mt|^Rp',
# num_pcs=15,
# clust_pcs=9,
# k=500)
#ss2_glia.seur@meta.data <- cbind(ss2_glia.seur@meta.data, allmeta_ss2_glia[rownames(ss2_glia.seur@meta.data),])
#saveRDS(ss2_neur.seur,"ss2_neur.seur.rds")
#saveRDS(ss2_glia.seur,"ss2_glia.seur.rds")
because this kind of step takes time and everytime to run would get a little different result,
here just use same parameters to run once then annotate the code, next always use the saved seurat object for analysis
#head(ss2_neur.seur@meta.data)
#head(ss2_glia.seur@meta.data)
here's a small bug for ss2_glia I have never solved after running this for so many times,
ss2_glia.seur has an orig.ident and active.ident named by "Mouse_Sample" from its sample names instead of specified 'name' in the script,
but with the same code, ss2_neur.seur's org.ident and active.ident are always the value of 'name',
this only affects the plotting, guess there's sth. different between ss2_neur/glia mtx files, just fix it
#ss2_glia.seur@active.ident <- ss2_glia.seur@meta.data$orig.ident <- factor("ss2_glia")
#saveRDS(ss2_neur.seur,"ss2_neur.seur.rds")
#saveRDS(ss2_glia.seur,"ss2_glia.seur.rds")
ss2_neur.seur <- readRDS("ss2_neur.seur.rds")
ss2_glia.seur <- readRDS("ss2_glia.seur.rds")
check variable genes to use
# new batch info
var_c_neur <- as.factor(batch_use.ss2_neur)
levels(var_c_neur)[table(var_c_neur) < min_cells] <- "merge"
# top 1500 from whole once
ss2_neur_var <- get_var_genes(ss2_neur.seur@assays[['RNA']]@data, samples=ss2_neur.seur$orig.ident, num_genes = 1500,min_cells = 5)
# top 1500 from each, merge, then new top 1500 from sorted by repeating times
ss2_neur_var_c <- get_var_genes(ss2_neur.seur@assays[['RNA']]@data, samples = var_c_neur, num_genes = 1500,min_cells = 5)
cat("ss2_neur vargenes: 1500\noverlap of 'sorted merged from each batch' and 'from whole': ",
sum(ss2_neur_var %in% ss2_neur_var_c),
"\nspecific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp): ",length(grep('^Cd|^Ig|^Rn|^mt|^Rp', ss2_neur_var_c)))
## ss2_neur vargenes: 1500
## overlap of 'sorted merged from each batch' and 'from whole': 887
## specific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp): 32
# new batch info
var_c_glia <- as.factor(batch_use.ss2_glia)
levels(var_c_glia)[table(var_c_glia) < min_cells] <- "merge"
# top 1500 from whole once
ss2_glia_var <- get_var_genes(ss2_glia.seur@assays[['RNA']]@data, samples=ss2_glia.seur$orig.ident, num_genes = 1500,min_cells = 5)
# top 1500 from each, merge, then new top 1500 from sorted by repeating times
ss2_glia_var_c <- get_var_genes(ss2_glia.seur@assays[['RNA']]@data, samples = var_c_glia, num_genes = 1500,min_cells = 5)
cat("ss2_glia vargenes: 1500\noverlap of 'sorted merged from each batch' and 'from whole': ",
sum(ss2_glia_var %in% ss2_glia_var_c),
"\nspecific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp): ",length(grep('^Cd|^Ig|^Rn|^mt|^Rp', ss2_glia_var_c)))
## ss2_glia vargenes: 1500
## overlap of 'sorted merged from each batch' and 'from whole': 811
## specific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp): 31
here comes a question I haven't explored the answers.
who not directly use variable genes or even corrected batches from ss2_mix's ?
here just restart those steps for sub-clustering.
check results
ElbowPlot(ss2_neur.seur,ndims = 30) + labs(title = "ss2_neur PCs, take 1-15")
PC1&2
DimPlot(ss2_neur.seur, reduction = 'pca', dims = c(1,2)) + labs(title = "ss2_neur PC1+2 ")
DimPlot(ss2_neur.seur, reduction = 'pca', dims = c(1,2), group.by = 'Annotation') + labs(title = "ss2_neur PC1+2 with inpaper annotations")
DimPlot(ss2_neur.seur, reduction = 'tsne') + labs(title = "ss2_neur tSNE")
DimPlot(ss2_neur.seur, reduction = 'tsne', group.by = 'Annotation',label = T ) + labs(title = "ss2_neur tSNE with inpaper annotations")
Phenograph clusters
DimPlot(ss2_neur.seur, group.by = 'phenograph.k15', label = T) + labs(title = "ss2_neur Phenograph output")
could see ss2_neur get 21 (inpaper) + 4 more (inlocal) = 25 sub-clusters.
how to really annotate them need to combine considerations of 'modification of clustering parameters' and 'marker genes of each sub-cluster relating to biological background',
it shouldn't be an easy part, but here we just do it in an easy way to annotate each cluster for a similar result like inpaper's
pheno_neur <- ss2_neur.seur@meta.data[c('phenograph.k15','Annotation')]
pheno_neur <- group_by(pheno_neur, phenograph.k15)
pheno_neur <- summarise(pheno_neur,
count=n(),
PEMN_1_inpaper = sum(Annotation == 'PEMN_1'),
PEMN_2_inpaper = sum(Annotation == 'PEMN_2'),
PEMN_3_inpaper = sum(Annotation == 'PEMN_3'),
PEMN_4_inpaper = sum(Annotation == 'PEMN_4'),
PEMN_5_inpaper = sum(Annotation == 'PEMN_5'),
PIMN_1_inpaper = sum(Annotation == 'PIMN_1'),
PIMN_2_inpaper = sum(Annotation == 'PIMN_2'),
PIMN_3_inpaper = sum(Annotation == 'PIMN_3'),
PIMN_4_inpaper = sum(Annotation == 'PIMN_4'),
PIMN_5_inpaper = sum(Annotation == 'PIMN_5'),
PIMN_6_inpaper = sum(Annotation == 'PIMN_6'),
PIMN_7_inpaper = sum(Annotation == 'PIMN_7'),
PIN_1_inpaper = sum(Annotation == 'PIN_1'),
PIN_2_inpaper = sum(Annotation == 'PIN_2'),
PIN_3_inpaper = sum(Annotation == 'PIN_3'),
PSN_1_inpaper = sum(Annotation == 'PSN_1'),
PSN_2_inpaper = sum(Annotation == 'PSN_2'),
PSN_3_inpaper = sum(Annotation == 'PSN_3'),
PSN_4_inpaper = sum(Annotation == 'PSN_4'),
PSVN_1_inpaper = sum(Annotation == 'PSVN_1'),
PSVN_2_inpaper = sum(Annotation == 'PSVN_2'))
#pheno_neur
pheno_neur <- rbind(pheno_neur,c("count",colSums(pheno_neur[,2:ncol(pheno_neur)])))
pheno_neur <- data.frame(t(pheno_neur))[c(1,3:23,2),][,c(order(as.numeric(pheno_neur$phenograph.k15[1:25])),26)]
colnames(pheno_neur) <- c(paste0("pheno_",pheno_neur[1,1:ncol(pheno_neur)-1]),"count")
pheno_neur <- pheno_neur[2:nrow(pheno_neur),]
pheno_neur
## pheno_0 pheno_1 pheno_2 pheno_3 pheno_4 pheno_5 pheno_6 pheno_7
## PEMN_1_inpaper 0 0 0 0 0 0 0 101
## PEMN_2_inpaper 0 0 0 0 0 0 1 21
## PEMN_3_inpaper 0 191 0 1 0 1 21 1
## PEMN_4_inpaper 0 12 0 0 0 0 45 0
## PEMN_5_inpaper 0 7 0 0 0 0 57 0
## PIMN_1_inpaper 0 0 0 0 3 33 0 0
## PIMN_2_inpaper 21 0 0 0 118 38 0 0
## PIMN_3_inpaper 0 0 0 0 6 65 0 0
## PIMN_4_inpaper 198 0 23 0 53 1 0 0
## PIMN_5_inpaper 13 0 168 5 5 0 0 0
## PIMN_6_inpaper 0 0 7 189 6 1 0 0
## PIMN_7_inpaper 1 0 0 1 0 0 0 0
## PIN_1_inpaper 0 0 0 0 0 0 0 0
## PIN_2_inpaper 0 0 0 0 0 0 0 0
## PIN_3_inpaper 0 0 0 0 0 0 0 0
## PSN_1_inpaper 0 0 0 0 0 0 0 0
## PSN_2_inpaper 0 0 0 0 0 0 0 0
## PSN_3_inpaper 0 0 0 0 0 0 0 0
## PSN_4_inpaper 0 20 0 0 0 0 2 0
## PSVN_1_inpaper 1 0 0 0 0 6 0 0
## PSVN_2_inpaper 0 0 1 0 0 0 0 0
## count 234 230 199 196 191 145 126 123
## pheno_8 pheno_9 pheno_10 pheno_11 pheno_12 pheno_13 pheno_14
## PEMN_1_inpaper 0 0 0 0 4 1 0
## PEMN_2_inpaper 0 0 0 0 47 0 0
## PEMN_3_inpaper 0 0 0 0 0 0 0
## PEMN_4_inpaper 0 0 0 0 18 0 0
## PEMN_5_inpaper 0 0 0 2 19 0 0
## PIMN_1_inpaper 0 0 37 0 0 0 1
## PIMN_2_inpaper 0 0 0 0 0 0 0
## PIMN_3_inpaper 0 0 0 0 0 0 0
## PIMN_4_inpaper 0 0 0 0 0 0 0
## PIMN_5_inpaper 0 0 3 0 0 0 0
## PIMN_6_inpaper 0 0 58 0 0 0 0
## PIMN_7_inpaper 0 0 1 0 0 0 0
## PIN_1_inpaper 68 0 0 0 0 1 64
## PIN_2_inpaper 0 0 0 0 0 27 0
## PIN_3_inpaper 0 0 0 0 0 58 0
## PSN_1_inpaper 0 0 0 2 0 1 0
## PSN_2_inpaper 0 0 0 0 0 0 0
## PSN_3_inpaper 38 0 0 0 0 0 18
## PSN_4_inpaper 0 0 0 92 0 0 0
## PSVN_1_inpaper 0 19 0 0 0 0 0
## PSVN_2_inpaper 0 80 0 0 0 0 0
## count 106 99 99 96 88 88 83
## pheno_15 pheno_16 pheno_17 pheno_18 pheno_19 pheno_20 pheno_21
## PEMN_1_inpaper 0 22 0 0 0 0 0
## PEMN_2_inpaper 0 5 0 0 0 3 0
## PEMN_3_inpaper 7 1 0 2 0 18 0
## PEMN_4_inpaper 0 49 0 0 0 3 0
## PEMN_5_inpaper 0 4 0 0 0 30 0
## PIMN_1_inpaper 0 0 0 0 0 0 28
## PIMN_2_inpaper 0 0 0 0 0 0 0
## PIMN_3_inpaper 0 0 0 0 0 0 10
## PIMN_4_inpaper 0 0 0 0 0 0 0
## PIMN_5_inpaper 0 0 0 0 0 0 0
## PIMN_6_inpaper 0 0 0 0 0 0 0
## PIMN_7_inpaper 0 0 0 0 0 0 0
## PIN_1_inpaper 0 0 0 0 0 0 0
## PIN_2_inpaper 0 0 78 0 0 0 0
## PIN_3_inpaper 0 0 1 0 0 0 0
## PSN_1_inpaper 0 0 0 0 51 0 0
## PSN_2_inpaper 0 0 0 0 5 0 0
## PSN_3_inpaper 0 0 0 0 0 0 0
## PSN_4_inpaper 75 0 0 0 0 1 0
## PSVN_1_inpaper 0 0 0 72 0 0 4
## PSVN_2_inpaper 0 0 0 0 0 0 0
## count 82 81 79 74 56 55 42
## pheno_22 pheno_23 pheno_24 count
## PEMN_1_inpaper 4 0 0 132
## PEMN_2_inpaper 28 0 0 105
## PEMN_3_inpaper 0 0 0 243
## PEMN_4_inpaper 0 0 0 127
## PEMN_5_inpaper 0 0 0 119
## PIMN_1_inpaper 0 0 0 102
## PIMN_2_inpaper 0 0 0 177
## PIMN_3_inpaper 0 0 0 81
## PIMN_4_inpaper 0 0 0 275
## PIMN_5_inpaper 0 1 0 195
## PIMN_6_inpaper 0 6 0 267
## PIMN_7_inpaper 0 20 0 23
## PIN_1_inpaper 0 0 0 133
## PIN_2_inpaper 0 0 0 105
## PIN_3_inpaper 1 0 0 60
## PSN_1_inpaper 0 0 2 56
## PSN_2_inpaper 0 0 23 28
## PSN_3_inpaper 0 0 0 56
## PSN_4_inpaper 0 0 0 190
## PSVN_1_inpaper 0 0 0 102
## PSVN_2_inpaper 0 0 0 81
## count 33 27 25 2657
pheatmap::pheatmap(pheno_neur %>% mutate_all(as.numeric), cluster_rows = F, cluster_cols = F, breaks = seq(0,300,3),
display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_neur))
check for each sub-clusters_inpaper
PEMNs_inpaper: totoally separated from other groups except to share pheno "1" (20, 10%) with PSN_4, but get mixed inside, _3 share a little "1" with _4/5, _2/4/5 share "12", so do pheno "16","20","6","7"
PIMNs_inpaper: similarly mixed inside, share pheno "0","10","2","23","3-5"
PINS_inpaper: mainly contain pheno "13" (_2/3 share),"14","17","8"
PSNs_inpaper: mainly contain pheno "11","15","19","24", _4 take some "1" from PEMN_3, _3 take some "14" and "8" from PIN_1
PSVNs_inpaper: mainly contain pheno "18" and "9"(_1 19, _2 80)
then we have
PEMNs_inlocal: pheno "7","12","22","1","16","20","6" as PEMN_1-7
PIMNs_inlocal: pheno "21","4","5","0","2","3","10","23" as PIMN_1-8
PINs_inlocal: pheno "8","14","17","13" as PIN_1-4
PSNs_inlocal: pheno "19","24","11","15" as PSN_1-4
PSVNs_inlocal: pheno "18","9" as PSVN_1-4
this might be laboursome to do it manually,
one way to do it automatically could be sorting the max count for each one to get a result like the heatmap below
(however still a cheap way to simulate a similar result but not for initial annotations)
pheatmap::pheatmap(pheno_neur[,c(paste0("pheno_",c(7,12,22,1,16,20,6,21,4,5,0,2,3,10,23,8,14,17,13,19,24,11,15,18,9)),"count")] %>%
mutate_all(as.numeric),
cluster_rows = F, cluster_cols = F, breaks = seq(0,300,3),
display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_neur))
then we have
subclust_neur <- data.frame(inlocal=paste0(c(paste0("PEMN_",1:7),paste0("PIMN_",1:8),paste0("PIN_",1:4),paste0("PSN_",1:4),paste0("PSVN_",1:2))),
raw=paste0("pheno_",c(7,12,22,1,16,20,6,21,4,5,0,2,3,10,23,8,14,17,13,19,24,11,15,18,9)))
pheatmap::pheatmap(pheno_neur[,c(subclust_neur$raw,"count")] %>%
mutate_all(as.numeric),
cluster_rows = F, cluster_cols = F, breaks = seq(0,300,3),
display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_neur),
labels_col = c(paste0(subclust_neur$inlocal,"_inlocal"),"count"),
gaps_col = c(7,15,19,23,25), gaps_row = c(5,12,15,19,21))
#write.table(pheno_neur,"pheno_neur.ss2.csv",col.names = TRUE,row.names = TRUE,sep = ",",quote = F)
label inlocal ss2_neur sub-clusters
ss2_neur.seur@meta.data$inlocal <- paste0("pheno_",ss2_neur.seur@meta.data$phenograph.k15)
for(i in 1:length(ss2_neur.seur@meta.data$inlocal)){
ss2_neur.seur@meta.data$inlocal[i] <- subclust_neur$inlocal[subclust_neur$raw == ss2_neur.seur@meta.data$inlocal[i]]
}
DimPlot(ss2_neur.seur, group.by = 'inlocal', label = T) + labs(title = "ss2_neur inlocal")
for ss2_neur sub-clustering, comparing to inpaper annotations,
we get a generally separated result between groups, but broadly mixed within one group, and partly mixed across some groups
DimPlot(ss2_neur.seur, group.by = 'Annotation', label = T) + labs(title = "ss2_neur inpaper")
marker pairs for main division into two groups
FeaturePlot(ss2_neur.seur, features = c("Chat","Nos1","Gfra2","Gfra1","Casz1","Etv1"))
group by conditions
multiplot(
DimPlot(ss2_neur.seur, group.by = 'Age',order=(c("NA","11","12","52","104","105")),
cols = rev(c("#312D8F","#312D8F","#7BAF42","#CB452B","#CB452B"))) +
labs(title = "Age"),
DimPlot(ss2_neur.seur, group.by = 'Segment',order=(c("NA","7AM","2PM","7PM"))) + labs(title = "Segment"),
DimPlot(ss2_neur.seur, group.by = 'Model') + labs(title = "Model"),
DimPlot(ss2_neur.seur, group.by = 'nGene', cols = material.heat(1909)) + labs(title = "Number of genes") +
guides(colour = guide_coloursteps(show.limits = TRUE,label=FALSE)),
DimPlot(ss2_neur.seur, group.by = 'Sex') + labs(title = "Sex"),
DimPlot(ss2_neur.seur, group.by = 'Time',order=(c("NA","7AM","2PM","7PM"))) + labs(title = "Time"),
cols=3)
by batch(Mouse_ID)
DimPlot(ss2_neur.seur, group.by = 'Mouse_ID', order=paste0("M",as.character(rev(1:29)))) + labs(title = "Mouse") +
theme(legend.text=element_text(size=8))
(other analysis for ss2_neur sub-clusters all in another subpage)
check results
ElbowPlot(ss2_glia.seur,ndims = 15) + labs(title = "ss2_glia PCs, take 1-9")
PC1&2
DimPlot(ss2_glia.seur, reduction = 'pca', dims = c(1,2)) + labs(title = "ss2_glia PC1+2 ")
DimPlot(ss2_glia.seur, reduction = 'pca', dims = c(1,2), group.by = 'Annotation') + labs(title = "ss2_glia PC1+2 with inpaper annotations")
DimPlot(ss2_glia.seur, reduction = 'tsne') + labs(title = "ss2_glia tSNE")
DimPlot(ss2_glia.seur, reduction = 'tsne', group.by = 'Annotation',label = T ) + labs(title = "ss2_glia tSNE with inpaper annotations")
Phenograph clusters
DimPlot(ss2_glia.seur, group.by = 'phenograph.k500', label = T) + labs(title = "ss2_glia Phenograph output")
could see we get a same 3 clusters with a very big 'k' for kNN setting,
one downleft cluster with 99% matched but the other two with 1/5-1/4 mixed
pheno_glia <- ss2_glia.seur@meta.data[c('phenograph.k500','Annotation')]
pheno_glia <- group_by(pheno_glia, phenograph.k500)
pheno_glia <- summarise(pheno_glia,
count=n(),
Glia_1_inpaper = sum(Annotation == 'Glia_1'),
Glia_2_inpaper = sum(Annotation == 'Glia_2'),
Glia_3_inpaper = sum(Annotation == 'Glia_3'))
pheno_glia <- rbind(pheno_glia,c("count",colSums(pheno_glia[,2:ncol(pheno_glia)])))
pheno_glia <- data.frame(t(pheno_glia))[c(1,3:5,2),][,c(order(as.numeric(pheno_glia$phenograph.k500[1:3])),4)]
colnames(pheno_glia) <- c(paste0("pheno_",pheno_glia[1,1:ncol(pheno_glia)-1]),"count")
pheno_glia <- pheno_glia[2:nrow(pheno_glia),]
pheno_glia
## pheno_0 pheno_1 pheno_2 count
## Glia_1_inpaper 22 1009 25 1056
## Glia_2_inpaper 821 54 171 1046
## Glia_3_inpaper 268 11 658 937
## count 1111 1074 854 3039
so, annotate with max count as ss2_neur
pheatmap::pheatmap(pheno_glia[,c(2,1,3,4)] %>% mutate_all(as.numeric), cluster_rows = F, cluster_cols = F, breaks = seq(0,1200,12),
display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_glia))
less clusters so faster to get
pheatmap::pheatmap(pheno_glia[,c(2,1,3,4)] %>% mutate_all(as.numeric), cluster_rows = F, cluster_cols = F, breaks = seq(0,1200,12),
display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_glia),
labels_col = c(paste0("Glia_",1:3,"_inlocal"),"count"))
label inlocal ss2_glia sub-clusters
ss2_glia.seur@meta.data$inlocal <- ss2_glia.seur@meta.data$phenograph.k500
ss2_glia.seur@meta.data$inlocal[ss2_glia.seur@meta.data$inlocal=="1"] <- "Glia_1"
ss2_glia.seur@meta.data$inlocal[ss2_glia.seur@meta.data$inlocal=="0"] <- "Glia_2"
ss2_glia.seur@meta.data$inlocal[ss2_glia.seur@meta.data$inlocal=="2"] <- "Glia_3"
DimPlot(ss2_glia.seur, group.by = 'inlocal', label = T) + labs(title = "ss2_glia inlocal")
for ss2_glia sub-clustering, comparing to inpaper annotations,
we get mostly matched Glia_1, but partly mixed Glia_2/3
DimPlot(ss2_glia.seur, group.by = 'Annotation', label = T) + labs(title = "ss2_glia inpaper")
marker genes
FeaturePlot(ss2_glia.seur, features = c("Gfra2","Slc18a2","Ntsr1"))
group by conditions
multiplot(
DimPlot(ss2_glia.seur, group.by = 'Age',order=(c("NA","11","12","52","104","105")),
cols = rev(c("#312D8F","#312D8F","#7BAF42","#CB452B","#CB452B"))) +
labs(title = "Age"),
DimPlot(ss2_glia.seur, group.by = 'Segment',order=(c("NA","7AM","2PM","7PM"))) + labs(title = "Segment"),
DimPlot(ss2_glia.seur, group.by = 'Model') + labs(title = "Model"),
DimPlot(ss2_glia.seur, group.by = 'nGene', cols = material.heat(1909)) + labs(title = "Number of genes") +
guides(colour = guide_coloursteps(show.limits = TRUE,label=FALSE)),
DimPlot(ss2_glia.seur, group.by = 'Sex') + labs(title = "Sex"),
DimPlot(ss2_glia.seur, group.by = 'Time',order=(c("NA","7AM","2PM","7PM"))) + labs(title = "Time"),
cols=3)
by batch(Mouse_ID)
DimPlot(ss2_glia.seur, group.by = 'Mouse_ID', order=paste0("M",as.character(rev(1:29)))) + labs(title = "Mouse") +
theme(legend.text=element_text(size=8))
(other analysis for ss2_glia sub-clusters all in another subpage)
#saveRDS(ss2_neur.seur,"ss2_neur.seur.rds")
#saveRDS(ss2_glia.seur,"ss2_glia.seur.rds")